# Computations
import pandas as pd
import numpy as np
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex
## seaborn
import seaborn as sns
sns.set_context('paper', rc={'font.size':12,'axes.titlesize':14,'axes.labelsize':12})
sns.set_style('white')
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
from plotly import tools
import plotly.express as px
import plotly.figure_factory as ff
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')
In this article, we analyze the UCI Statlog (German Credit Data) dataset, obtained from Kaggle.com.
The original dataset contains 1000 entries with 20 categorical/symbolic attributes prepared by Prof. Hofmann. In this dataset, each entry represents a person who takes credit from a bank. Each person is classified as a good or bad credit risk according to the set of attributes. The link to the original dataset can be found below.
It is almost impossible to understand the original dataset due to its complicated system of categories and symbols. Thus, I wrote a small Python script to convert it into a readable CSV file. Several columns are simply ignored, because in my opinion either they are not important or their descriptions are obscure. The selected attributes are:
# Load the readable CSV version of the credit data; the first CSV column becomes the row index.
Data = pd.read_csv('Data/german_credit_data.csv', index_col=0)
def Data_info(Inp, Only_NaN = False):
    """Summarise the data type and missing-value count of every column.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to inspect.
    Only_NaN : bool, default False
        When True, keep only the columns holding at least one missing value.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``Inp`` with the columns 'Data Type',
        'Number of NaN Values' and 'Percentage' (missing values as a
        percentage of the number of rows, rounded to two decimals).
    """
    Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    # NOTE(review): an outer join may re-order the rows by column label, which
    # would override the dtype ordering requested above - confirm which order is wanted.
    Out = Out.join(Inp.isnull().sum().to_frame(name='Number of NaN Values'), how='outer')
    n_rows = Inp.shape[0]
    if n_rows:
        Out['Percentage'] = np.round(100 * (Out['Number of NaN Values'] / n_rows), 2)
    else:
        # A frame without rows has no missing values; skip the 0/0 division
        # that would otherwise fill the column with NaN.
        Out['Percentage'] = 0.0
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values'] > 0]
    return Out
#
# Every table below is preceded by a caption printed in cyan on black.
_caption_style = Back.BLACK + Fore.CYAN + Style.NORMAL

print(_caption_style + 'The Dataset:')
display(Data.head())

print(_caption_style + 'Nan Values:')
display(Data_info(Data))

print(_caption_style + 'Dataset Shape:')
_shape_table = pd.DataFrame([Data.shape], columns = ['Instances','Attributes'], index = ['Dataset'])
display(_shape_table)
# Missing account information becomes its own explicit 'None' category.
for _column in ('Checking account', 'Saving accounts'):
    Data[_column] = Data[_column].fillna('None')

# Title-case the values of every categorical text column.
for _column in ('Sex', 'Housing', 'Checking account', 'Saving accounts', 'Purpose', 'Risk'):
    Data[_column] = Data[_column].map(str.title)

# Title-casing lower-cases the 'V' of 'TV'; restore the acronym.
Data['Purpose'] = Data['Purpose'].replace({'Radio/Tv':'Radio/TV'})

# Title-case the column names too (e.g. 'Saving accounts' -> 'Saving Accounts').
Data.columns = [name.title() for name in Data.columns]
Data.head()
Let's see how the Risk feature is distributed across the dataset.
# Good/Bad risk palette and the outline colour drawn around its marks.
GB_Colors, GB_LC = ['LightCoral', 'LimeGreen'], 'Black'
# Male/Female palette and the outline colour drawn around its marks.
MF_Colors, MF_LC = ['HotPink', 'RoyalBlue'], 'Navy'
#
# Number of records and percentage share of each risk class.
Temp = Data.groupby(['Risk'])['Risk'].agg(['count']).rename(columns = {'count':'Count'}).reset_index(drop = False)
Temp['Percentage'] = np.round(100* Temp['Count'].values /Temp['Count'].sum(), 2)

# Styler.hide_index() and Styler.set_precision() were removed in pandas 2.0.
# Use their replacements when available and fall back to the legacy calls otherwise.
_styler = Temp.style
if hasattr(_styler, 'hide'):
    _styler = _styler.hide(axis='index').format(precision=2)
else:
    _styler = _styler.hide_index().set_precision(2)
display(_styler)

# Horizontal bar per risk class, labelled with its percentage.
fig = go.Figure(data=[go.Bar(x = Temp['Percentage'], y = Temp['Risk'], text = Temp['Percentage'],
                             marker_color= GB_Colors, orientation='h')])
fig.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1,
                  texttemplate='%{text:.2}%', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide',
                  plot_bgcolor= 'white', height= 260,
                  xaxis = dict(range=[0, 80]),
                  title={'text': 'Risk Distribution',
                         'x':0.46, 'y':0.75,
                         'xanchor': 'center', 'yanchor': 'top'})
# Light-gray frame around the plotting area.
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Credit-amount histogram per risk class, with a marginal box plot.
_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

fig = px.histogram(Data, x = 'Credit Amount', color='Risk', marginal= 'box',
                   color_discrete_sequence= GB_Colors,
                   hover_data=['Credit Amount', 'Risk'])
fig.update_traces(marker_line_color= GB_LC, marker_line_width=0.5, opacity=1)
fig.update_layout(title = 'Credit Distribution Histogram', plot_bgcolor= 'white')
# Light-gray frame on both axes, plus horizontal grid lines.
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.show()
# Record count of every (Sex, Risk) pair and its share of all records.
Temp = Data.groupby(['Sex','Risk'])['Risk'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = (100 * Temp['Count'] / Temp['Count'].sum()).round(2)
display(Temp)
Temp = Temp.reset_index()

_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

# One horizontal bar per gender, coloured by risk class.
fig = px.bar(Temp, y= 'Sex', x= 'Percentage', orientation='h',
             color = 'Risk', text = 'Percentage',
             color_discrete_sequence= GB_Colors, height= 220)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1,
                  texttemplate='%{text:.2}%', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide',
                  plot_bgcolor= 'white',
                  xaxis = dict(range=[0, 80]),
                  title={'text': 'Gender Distribution by Risk',
                         'x':0.46, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(**_axis_frame)
fig.show()
# Record count of every (Age, Risk) pair and its share of all records.
Temp = Data.groupby(['Age','Risk'])['Age'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = (100 * Temp['Count'] / Temp['Count'].sum()).round(2)
Temp = Temp.reset_index()

# Negate the 'Bad' shares so that their bars extend below the zero line.
_is_bad = Temp['Risk'] == 'Bad'
Temp.loc[_is_bad, 'Percentage'] = -Temp.loc[_is_bad, 'Percentage']

_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
_ticks = np.arange(-4, 5)

fig = px.bar(Temp, x= 'Age', y= 'Percentage', color = 'Risk', hover_data= ['Risk', 'Count'],
             color_discrete_sequence= GB_Colors, height= 500)
# Final bar outline (an earlier GB_LC / 0.5 outline was fully overridden by this one).
fig.update_traces(marker_line_color= 'Navy', marker_line_width=1.2, opacity=1)
# Tick labels show absolute values, hiding the sign flip applied to the 'Bad' bars.
fig.update_layout(yaxis = dict(range=[-3, 4], tickmode = 'array',
                               tickvals = _ticks, ticktext = np.abs(_ticks)),
                  legend_orientation='h', plot_bgcolor= 'white',
                  title={'text': 'Age Distribution by Risk',
                         'x':0.5, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.show()
# Temp is not read by the plot below (px.box is given Data); it is still rebound here.
Temp = Data[['Age','Risk','Sex']]

_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

# Age box plot per gender, split by risk class.
fig = px.box(Data, x='Sex', y='Age', color='Risk', color_discrete_sequence= GB_Colors[::-1])
fig.update_traces(quartilemethod='linear')
fig.update_layout(height= 500, width = 600, plot_bgcolor= 'white',
                  title={'text': 'Age Distribution by Gender and Risk',
                         'x':0.47, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(range=[10, 80], showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.show()
Creating new features:
We can create age categories following the age groups used by Statistics Canada (statcan.gc.ca).
| Interval | Age Category |
|---|---|
| 00-14 years | Children |
| 15-24 years | Youth |
| 25-64 years | Adults |
| 65 years and over | Seniors |
# Age bands follow the table above; intervals are right-closed, so (14, 24]
# holds ages 15-24. Each band is paired with its category label so that the
# labels never depend on how an interval happens to be printed.
_age_min, _age_max = Data['Age'].min(), Data['Age'].max()
_age_bands = [((14, 24), 'Youth'),
              ((24, 64), 'Adults'),
              # Keep the last interval valid even when nobody is older than 64.
              ((64, max(_age_max, 65)), 'Seniors')]
if _age_min <= 14:
    # Ages up to and including 14 need the extra 'Children' band.
    _age_bands.insert(0, ((0, 14), 'Children'))

bins = pd.IntervalIndex.from_tuples([band for band, _ in _age_bands])
Data['Age Group'] = pd.cut(Data['Age'], bins)
# Categories come back in the order of `bins`, so they can be renamed positionally.
Data['Age Category'] = (Data['Age Group']
                        .cat.rename_categories([label for _, label in _age_bands])
                        .astype(str))
Data.head()
# Record count of every (Sex, Age Group, Age Category, Risk) combination and its share of all records.
Temp = Data.groupby(['Sex','Age Group','Age Category','Risk'])['Risk'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = (100 * Temp['Count'] / Temp['Count'].sum()).round(2)
# Drop the rows in which every value is zero.
Temp = Temp.loc[(Temp != 0).any(axis=1)]
display(Temp)
Temp = Temp.reset_index().sort_values(by=['Age Group'])

# Settings shared by the two bar charts and by the frame around each subplot.
_bar_args = dict(y= 'Age Category', x= 'Percentage', orientation='h',
                 color = 'Sex', text = 'Percentage', hover_data= Temp.columns,
                 color_discrete_sequence = MF_Colors, height= 400)
_outline = dict(marker_line_color= MF_LC, marker_line_width=1, opacity=1)
_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

# Figures
fig = make_subplots(rows=2, cols=1, vertical_spacing = 0.05, shared_xaxes=True,
                    subplot_titles=('Risk: Good', 'Risk: Bad'))
# Top: good-risk records
fig1 = px.bar(Temp.loc[Temp.Risk == 'Good'], **_bar_args)
for _i in (0, 1):
    fig.add_trace(fig1['data'][_i], row=1, col=1)
fig.update_traces(row=1, col=1, **_outline)
# Bottom: bad-risk records (legend entries hidden for these traces)
fig2 = px.bar(Temp.loc[Temp.Risk == 'Bad'], **_bar_args)
for _i in (0, 1):
    fig.add_trace(fig2['data'][_i], row=2, col=1)
fig.update_traces(showlegend = False, row=2, col=1, **_outline)
# Update
fig.update_traces(texttemplate='%{text:.2}%', textposition='outside')
fig.update_layout(height= 600, plot_bgcolor= 'white', legend_orientation='h',
                  title={'text': 'Age Group Distribution by Gender and Risk',
                         'x':0.50, 'y':0.92,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.update_xaxes(title_text='Percent', range=[0, 50], row=2, col=1)
# Both subplot rows carry the same y-axis title.
fig.update_yaxes(title_text='Age Group', **_axis_frame)
fig.show()
A box plot is a statistical representation of numerical data through their quartiles. The ends of the box represent the lower and upper quartiles, while the median (second quartile) is marked by a line inside the box.
# Credit-amount box plot per age category, split by risk class.
_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

fig = px.box(Data, x='Age Category', y='Credit Amount', color='Risk',
             color_discrete_sequence= GB_Colors[::-1])
fig.update_traces(quartilemethod='linear')
fig.update_layout(height= 500, width = 600, plot_bgcolor= 'white',
                  title={'text': 'Credit Amount Distribution by Age Group and Risk',
                         'x':0.48, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(range=[0, 20e3], showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.show()
# Work on a copy so that relabelling 'Job' is not an assignment on a slice of Data.
Temp = Data[['Risk','Job']].copy()
# Replace the numeric job codes with readable labels.
Temp['Job'] = Temp['Job'].map({0: 'Unskilled and Non-Resident',
                               1: 'Unskilled and Resident',
                               2: 'Skilled',
                               3: 'Highly Skilled'})
# Record count of every (Risk, Job) pair and its share of all records.
Temp = Temp.groupby(['Risk','Job'])['Job'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = np.round(100* Temp.values /Temp.sum().values, 2)
display(Temp)
# Reset the index exactly once: a second reset would add a meaningless 'index'
# column that then shows up in the hover box through hover_data=Temp.columns.
Temp.reset_index(inplace = True, drop = False)

fig = px.bar(Temp, y= 'Job', x= 'Percentage', orientation='h', hover_data= Temp.columns,
             color = 'Risk', text = 'Percentage', color_discrete_sequence= GB_Colors, height= 300)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
fig.update_traces(texttemplate='%{text:.2}%', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 70])
fig.update_layout(legend_orientation='h', plot_bgcolor= 'white')
# Light-gray frame around the plotting area.
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_layout(title={'text': 'Job Distribution by Risk',
                         'x':0.56, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
# Record count of every (Risk, Housing) pair and its share of all records.
Temp = Data.groupby(['Risk','Housing'])['Risk'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = (100 * Temp['Count'] / Temp['Count'].sum()).round(2)
display(Temp)
Temp = Temp.reset_index()

_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

# One horizontal bar per housing type, coloured by risk class.
fig = px.bar(Temp, y= 'Housing', x= 'Percentage', orientation='h', hover_data= Temp.columns,
             color = 'Risk', text = 'Percentage',
             color_discrete_sequence= GB_Colors, height= 280)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1,
                  texttemplate='%{text:.2}%', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide',
                  plot_bgcolor= 'white',
                  xaxis = dict(range=[0, 80]),
                  title={'text': 'Housing Distribution by Risk',
                         'x':0.5, 'y':0.9,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(**_axis_frame)
fig.show()
A violin plot is a statistical representation of numerical data. It is similar to a box plot, with the addition of a rotated kernel density plot on each side. The ends of the box represent the lower and upper quartiles, while the median (second quartile) is marked by a line inside the box. Moreover, the data distribution in each case can be seen as well.
# Credit-amount violin plot (with inner box) per housing type, split by risk class.
_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

fig = px.violin(Data, x='Housing', y='Credit Amount', color='Risk', box=True,
                hover_data=['Housing','Credit Amount','Risk'],
                color_discrete_sequence= GB_Colors[::-1])
fig.update_layout(plot_bgcolor= 'white',
                  title={'text': 'Credit Amount Distribution by Housing and Risk',
                         'x':0.46, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
# The y-range starts below zero; the zero line and the grid are drawn in light gray.
fig.update_yaxes(range=[-5e3, 25e3],
                 zeroline=True, zerolinewidth=1, zerolinecolor='Lightgray',
                 showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 **_axis_frame)
fig.show()
# Work on a copy so that relabelling 'Job' is not an assignment on a slice of Data.
Temp = Data[['Risk','Job','Credit Amount']].copy()
# Replace the numeric job codes with readable labels.
Temp['Job'] = Temp['Job'].map({0: 'Unskilled and Non-Resident',
                               1: 'Unskilled and Resident',
                               2: 'Skilled',
                               3: 'Highly Skilled'})
# Credit-amount violin plot (with inner box) per job label, split by risk class.
fig = px.violin(Temp, x='Job', y='Credit Amount', color='Risk', box=True,
                hover_data=['Job','Credit Amount','Risk'], color_discrete_sequence= GB_Colors[::-1])
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True, range=[-5e3, 25e3])
fig.update_yaxes(zeroline=True, zerolinewidth=1, zerolinecolor='Lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(title={'text': 'Credit Amount Distribution by Job and Risk',
                         'x':0.46, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()
# Credit-amount box plot per gender, split by risk class.
_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

fig = px.box(Data, x='Sex', y='Credit Amount', color='Risk',
             color_discrete_sequence= GB_Colors[::-1])
fig.update_traces(quartilemethod='linear')
fig.update_layout(height= 500, width = 500, plot_bgcolor= 'white',
                  title={'text': 'Credit Amount Distribution by Gender',
                         'x':0.48, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(range=[0, 20e3], showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.show()
# Readable labels for the numeric job codes (same wording as the other job charts).
_JOB_LABELS = {0: 'Unskilled and Non-Resident',
               1: 'Unskilled and Resident',
               2: 'Skilled',
               3: 'Highly Skilled'}


def _credit_by_feature_figure(frame, feat, average_max):
    """Build the two-panel 'Credit Amount' figure for one categorical feature.

    The left panel holds box plots of 'Credit Amount' per value of ``feat``,
    split by 'Risk'; the right panel holds grouped bars of the mean
    'Credit Amount' for the same split.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with the columns ``feat``, 'Risk' and 'Credit Amount'.
    feat : str
        Name of the categorical column placed on the x-axis.
    average_max : float
        Upper limit of the y-axis of the right-hand (average) panel.

    Returns
    -------
    tuple
        ``(figure, averages)`` where ``averages`` is the table of mean credit
        amounts, rounded to two decimals and indexed by (``feat``, 'Risk').
    """
    axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    figure = make_subplots(rows=1, cols=2, shared_xaxes=True,
                           subplot_titles=('Credit Distribution by %s and Risk' % feat,
                                           'Average Credit by %s and Risk' % feat))
    # Left: one box trace per risk class.
    boxes = px.box(frame, x=feat, y='Credit Amount', color='Risk',
                   color_discrete_sequence= GB_Colors[::-1])
    boxes.update_traces(quartilemethod='linear')
    figure.add_trace(boxes['data'][0], row=1, col=1)
    figure.add_trace(boxes['data'][1], row=1, col=1)
    # Legend entries are hidden for the box traces.
    figure.update_traces(marker_line_color= GB_LC, marker_line_width=.2, opacity=1,
                         showlegend = False, row=1, col=1)
    # Right: mean credit amount per (feature value, risk class).
    averages = (frame.groupby([feat,'Risk'])['Credit Amount'].agg(['mean'])
                .rename(columns = {'mean':'Average'}).round(2))
    bars = px.bar(averages.reset_index(), x= feat, y= 'Average', orientation='v',
                  color = 'Risk', text = 'Average', barmode='group',
                  color_discrete_sequence= GB_Colors)
    bars.update_traces(marker_line_color= GB_LC, marker_line_width=1, opacity=1)
    figure.add_trace(bars['data'][0], row=1, col=2)
    figure.add_trace(bars['data'][1], row=1, col=2)
    # Layout and axes.
    figure.update_layout(boxmode='group', plot_bgcolor= 'white',
                         title={'text': 'Credit Amount Distribution by %s and Risk' % feat,
                                'x':0.46, 'y':0.9,
                                'xanchor': 'center', 'yanchor': 'top'})
    figure.update_yaxes(range=[0, 20e3], row=1, col=1)
    figure.update_yaxes(title_text='Average', range=[0, average_max], row=1, col=2)
    figure.update_xaxes(**axis_frame)
    figure.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray', **axis_frame)
    return figure, averages


# One figure per feature; the second entry of each pair is the y-axis limit of
# the average panel.
for Feat, _average_max in (('Housing', 6e3),
                           ('Job', 7e3),
                           ('Saving Accounts', 5e3),
                           ('Checking Account', 5e3),
                           ('Purpose', 12e3)):
    # Copy so that relabelling 'Job' never writes into a slice of Data.
    Temp = Data[[Feat,'Risk','Credit Amount']].copy()
    if Feat == 'Job':
        Temp['Job'] = Temp['Job'].map(_JOB_LABELS)
    fig, Temp = _credit_by_feature_figure(Temp, Feat, _average_max)
    display(Temp)
    fig.show()
Temp = Data[['Sex','Credit Amount','Risk']]

# Settings shared by both panels.
_outline = dict(marker_line_color= GB_LC, marker_line_width=.2, opacity=1)
_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.05, shared_yaxes=True,
                    subplot_titles=('Good Risk', 'Bad Risk'))
# Left: good-risk records (these traces keep their legend entries)
fig1 = px.box(Temp.loc[Temp.Risk == 'Good'], x= 'Sex', y='Credit Amount',
              hover_data= Temp.columns, color='Sex',
              color_discrete_sequence= MF_Colors[::-1])
fig1.update_traces(quartilemethod='linear')
for _i in (0, 1):
    fig.add_trace(fig1['data'][_i], row=1, col=1)
fig.update_traces(showlegend = True, row=1, col=1, **_outline)
# Right: bad-risk records (legend entries hidden)
fig2 = px.box(Temp.loc[Temp.Risk == 'Bad'], x= 'Sex', y='Credit Amount',
              hover_data= Temp.columns, color='Sex',
              color_discrete_sequence= MF_Colors)
fig2.update_traces(quartilemethod='linear')
for _i in (0, 1):
    fig.add_trace(fig2['data'][_i], row=1, col=2)
fig.update_traces(showlegend = False, row=1, col=2, **_outline)
# Updates
fig.update_layout(boxmode='group', plot_bgcolor= 'white', width= 600,
                  title={'text': 'Credit Amount Distribution by Risk and Gender',
                         'x':0.46, 'y':0.9,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(range=[0, 20e3], showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.show()
# Record count of every (Duration, Risk) pair; 'Percentage' is its share of all
# records (computed here, although the chart below plots 'Count').
Temp = Data.groupby(['Duration','Risk'])['Risk'].agg(['count']).rename(columns = {'count':'Count'})
Temp['Percentage'] = (100 * Temp['Count'] / Temp['Count'].sum()).round(2)
Temp = Temp.reset_index()

_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

# Grouped bars: one pair of bars per duration value.
fig = px.bar(Temp, x= 'Duration', y= 'Count', color = 'Risk', hover_data= ['Risk', 'Count'],
             barmode='group',
             color_discrete_sequence= GB_Colors[::-1], height= 500)
# Final bar outline (an earlier GB_LC / 0.5 outline was fully overridden by this one).
fig.update_traces(marker_line_color= 'Navy', marker_line_width=1.2, opacity=1)
fig.update_layout(yaxis = dict(range=[0, 140]),
                  legend_orientation='h', plot_bgcolor= 'white',
                  title={'text': 'Duration Distribution by Risk',
                         'x':0.5, 'y':0.94,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.show()
# Distribution plot of 'Duration' for each risk class (rug plot disabled).
_durations = [Data.loc[Data.Risk == 'Good', 'Duration'],
              Data.loc[Data.Risk == 'Bad', 'Duration']]
_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

fig = ff.create_distplot(_durations, ['Good', 'Bad'], colors= GB_Colors[::-1],
                         show_rug=False, bin_size= 1.5)
fig.update_traces(marker_line_color= GB_LC, marker_line_width=0.5, opacity= 0.7)
fig.update_layout(plot_bgcolor= 'white',
                  title={'text': 'Duration Distplot',
                         'x':0.5, 'y':0.90,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(range=[0, 0.14], showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.show()
# Credit-amount box plot per duration value, split by risk class.
_axis_frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)

fig = px.box(Data, x='Duration', y='Credit Amount', color='Risk',
             color_discrete_sequence= GB_Colors[::-1])
fig.update_traces(quartilemethod='linear')
fig.update_layout(plot_bgcolor= 'white',
                  title={'text': 'Credit Amount Distribution by Duration',
                         'x':0.46, 'y':0.95,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.update_xaxes(**_axis_frame)
fig.update_yaxes(range=[0, 20e3], showgrid=True, gridwidth=1, gridcolor='Lightgray', **_axis_frame)
fig.show()